{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Read File and make files ready for prediction adjustment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# -------Import Necessary Files --------------------------------------\n",
    "import numpy as np \n",
    "import pandas as pd "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ---------Read Files --------------------------------------------------\n",
    "train    = pd.read_csv('train.csv')\n",
    "test     = pd.read_csv('test.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(15285, 12)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "(12140, 11)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# ----------------- Drop Duplicates-------------------------------------\n",
    "train = train.drop_duplicates()  # Drop Duplicates \n",
    "train = train.reset_index(drop=True)  # Reset Index\n",
    "display(train.shape,test.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Prediction Adjuster \n",
    "\n",
    "**`Steps:-`**\n",
    "\n",
    "* Run Train loop\n",
    "* Run Test loop \n",
    "* Check overlapped predictions \n",
    "* Adjust the same predictions in cg dataframe\n",
    "* Save file as common_grounds.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "at row 0\n",
      "at row 2000\n",
      "at row 4000\n",
      "at row 6000\n",
      "at row 8000\n",
      "at row 10000\n",
      "at row 12000\n",
      "at row 14000\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>popularity</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    popularity\n",
       "0            3\n",
       "4            4\n",
       "6            4\n",
       "15           1\n",
       "16           3"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## ---------------------------- Prediction Adjuster -------------------------------------\n",
    "#  Create Common_grounds.csv file which will be our prediction adjuster file\n",
    "# In this way ,we are able to retain predictions which has overlapped data in train and test\n",
    "\n",
    "target = train['popularity']\n",
    "mydata1 = train.drop('popularity', axis = 1)\n",
    "mydata2 = test.copy()\n",
    "\n",
    "common_grounds = pd.DataFrame(index = range(len(mydata2)), columns = ['popularity'])\n",
    "columns = list(mydata1.columns)\n",
    "\n",
    "for idx1 in range(len(mydata1)): # train Loop\n",
    "    if idx1%5000 == 0:\n",
    "        print(f'at row {idx1}')\n",
    "    for idx2 in range(len(mydata2)): # test loop\n",
    "        for colum in columns:\n",
    "            if mydata2.loc[idx2, colum] != mydata1.loc[idx1, colum]:\n",
    "                #idx2 = idx2+1\n",
    "                break\n",
    "            elif colum == mydata1.columns[-1:]: \n",
    "\n",
    "                common_grounds.iloc[idx2] = target.iloc[idx1]\n",
    "common_grounds.dropna(axis = 0, how ='all', inplace = True) \n",
    "#display(len(common_grounds), common_grounds.head())\n",
    "common_grounds.to_csv('common_grounds.csv')\n",
    "cg = pd.read_csv('common_grounds.csv', index_col=0)\n",
    "cg.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4    2697\n",
       "3     458\n",
       "5      93\n",
       "1      82\n",
       "0       4\n",
       "Name: popularity, dtype: int64"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cg = pd.read_csv('common_grounds.csv', index_col=0)\n",
    "cg['popularity'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<span style=\"font-family: Arial; font-weight:bold;font-size:1.9em;color:#f97102;\"> -------------------------End of File---------------------------------"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
